#Before starting

# Any number can be chosen 
set.seed(238427)

##Testing timing of Rmd for server evaluation

# What time did we start running this script? 
start_time <- Sys.time()
start_time
## [1] "2024-05-02 09:31:04 EDT"

Goals of this file

  1. Use raw fastq files and generate quality plots to assess quality of reads.
  2. Filter and trim out bad sequences and bases from our sequencing files.
  3. Write out fastq files with high quality sequences.
  4. Evaluate the quality from our filter and trim.
  5. Infer Errors on forward and reverse reads individually.
  6. Identified ASVs on forward and reverse reads separately, using the error model.
  7. Merge forward and reverse ASVs into “contiguous ASVs”.
  8. Generate the ASV count table. (otu_table input for phyloseq.).

Output that we need:

  1. ASV Count Table: otu_table
  2. Taxonomy Table tax_table
  3. Sample Information: sample_data track the reads lots throughout DADA2 workflow.

Load Libraries

# Efficient package loading with pacman 
devtools::install_github("thomasp85/patchwork@HEAD")
## Skipping install of 'patchwork' from a github remote, the SHA1 (d9437579) has not changed since last install.
##   Use `force = TRUE` to force installation
pacman::p_load(tidyverse, BiocManager, devtools, dada2, 
               phyloseq, patchwork, DT, iNEXT, vegan,
               install = FALSE)
library(patchwork)

Load Data

# Set the raw fastq path to the raw sequencing files 
# Path to the fastq files 
raw_fastqs_path <- "data/01_DADA2/01_raw_gzipped_fastqs"
raw_fastqs_path
## [1] "data/01_DADA2/01_raw_gzipped_fastqs"
# What files are in this path? Intuition Check 
head(list.files(raw_fastqs_path))
## [1] "SRR11364336_1.fastq.gz" "SRR11364336_2.fastq.gz" "SRR11364337_1.fastq.gz"
## [4] "SRR11364337_2.fastq.gz" "SRR11364338_1.fastq.gz" "SRR11364338_2.fastq.gz"
# How many files are there? 
str(list.files(raw_fastqs_path))
##  chr [1:120] "SRR11364336_1.fastq.gz" "SRR11364336_2.fastq.gz" ...
# Create vector of forward reads
forward_reads <- list.files(raw_fastqs_path, pattern = "1.fastq.gz", full.names = TRUE)  
# Intuition Check 
head(forward_reads)
## [1] "data/01_DADA2/01_raw_gzipped_fastqs/SRR11364336_1.fastq.gz"
## [2] "data/01_DADA2/01_raw_gzipped_fastqs/SRR11364337_1.fastq.gz"
## [3] "data/01_DADA2/01_raw_gzipped_fastqs/SRR11364338_1.fastq.gz"
## [4] "data/01_DADA2/01_raw_gzipped_fastqs/SRR11364339_1.fastq.gz"
## [5] "data/01_DADA2/01_raw_gzipped_fastqs/SRR11364340_1.fastq.gz"
## [6] "data/01_DADA2/01_raw_gzipped_fastqs/SRR11364341_1.fastq.gz"
# Create a vector of reverse reads 
reverse_reads <- list.files(raw_fastqs_path, pattern = "2.fastq.gz", full.names = TRUE)
head(reverse_reads)
## [1] "data/01_DADA2/01_raw_gzipped_fastqs/SRR11364336_2.fastq.gz"
## [2] "data/01_DADA2/01_raw_gzipped_fastqs/SRR11364337_2.fastq.gz"
## [3] "data/01_DADA2/01_raw_gzipped_fastqs/SRR11364338_2.fastq.gz"
## [4] "data/01_DADA2/01_raw_gzipped_fastqs/SRR11364339_2.fastq.gz"
## [5] "data/01_DADA2/01_raw_gzipped_fastqs/SRR11364340_2.fastq.gz"
## [6] "data/01_DADA2/01_raw_gzipped_fastqs/SRR11364341_2.fastq.gz"

Assess Raw Read Quality

Evaluate raw sequence quality

Let’s see the quality of the raw reads before we trim

Plot 12 random samples of plots

# Randomly select 12 samples from dataset to evaluate 
# Selecting 12 is typically better than 2 (like we did in class for efficiency)
random_samples <- sample(1:length(reverse_reads), size = 12)
random_samples
##  [1] 46 35 56 42 27  3 40 23 21 24 20  5
library("dada2")
library("tidyverse")

# Calculate and plot quality of these two samples
forward_filteredQual_plot_12 <- plotQualityProfile(forward_reads[random_samples]) + 
  labs(title = "Forward Read: Raw Quality")

reverse_filteredQual_plot_12 <- plotQualityProfile(reverse_reads[random_samples]) + 
  labs(title = "Reverse Read: Raw Quality")


# Plot them together with patchwork
forward_filteredQual_plot_12 + reverse_filteredQual_plot_12

Aggregated Raw Quality Plots

# Aggregate all QC plots 
# Forward reads
forward_preQC_plot <- 
  plotQualityProfile(forward_reads, aggregate = TRUE) + 
  labs(title = "Forward Pre-QC")

# reverse reads
reverse_preQC_plot <- 
  plotQualityProfile(reverse_reads, aggregate = TRUE) + 
  labs(title = "Reverse Pre-QC")

preQC_aggregate_plot <- 
  # Plot the forward and reverse together 
  forward_preQC_plot + reverse_preQC_plot

# Show the plot
preQC_aggregate_plot

Interpretation:

The quality is higher at the beginning of the read and slowly gets worse and worse as the read progresses. This is typical of Illumina sequencing because of phasing. The first few bases at the beginning of the forward reads and also in the reverse reads have very low quality bases.

Prepare a placeholder for filtered reads

# vector of our samples, extract sample name from files 
samples <- sapply(strsplit(basename(forward_reads), "_"), `[`,1) 
# Intuition Check 
head(samples)
## [1] "SRR11364336" "SRR11364337" "SRR11364338" "SRR11364339" "SRR11364340"
## [6] "SRR11364341"
# Place filtered reads into filtered_fastqs_path
filtered_fastqs_path <- "data/01_DADA2/02_filtered_fastqs"
filtered_fastqs_path
## [1] "data/01_DADA2/02_filtered_fastqs"
# create 2 variables: filtered_F, filtered_R
filtered_forward_reads <- 
  file.path(filtered_fastqs_path, paste0(samples, "_R1_filtered.fastq.gz"))
length(filtered_forward_reads)
## [1] 60
# reverse reads
filtered_reverse_reads <- 
  file.path(filtered_fastqs_path, paste0(samples, "_R2_filtered.fastq.gz"))
head(filtered_reverse_reads)
## [1] "data/01_DADA2/02_filtered_fastqs/SRR11364336_R2_filtered.fastq.gz"
## [2] "data/01_DADA2/02_filtered_fastqs/SRR11364337_R2_filtered.fastq.gz"
## [3] "data/01_DADA2/02_filtered_fastqs/SRR11364338_R2_filtered.fastq.gz"
## [4] "data/01_DADA2/02_filtered_fastqs/SRR11364339_R2_filtered.fastq.gz"
## [5] "data/01_DADA2/02_filtered_fastqs/SRR11364340_R2_filtered.fastq.gz"
## [6] "data/01_DADA2/02_filtered_fastqs/SRR11364341_R2_filtered.fastq.gz"

Filter and Trim Reads

To pay attention to:

Parameters of filter and trim DEPEND ON THE DATASET. The things to keep in mind are:
- The library preparation: Are the primers included in the sequence? If so, they need to be trimmed out in this step.
- What do the above quality profiles of the reads look like? If they are lower quality, it is highly recommended to use maxEE = c(1,1).
- Do the reads dip suddenly in their quality? If so, explore trimLeft and truncLen

#It was not clear in the paper if sequences were in the sequence which I assume they were.
filtered_reads <-filterAndTrim(forward_reads, filtered_forward_reads,
             reverse_reads, filtered_reverse_reads,
             truncLen = c(290,290), trimLeft = c(19,20),
             maxN = 0, truncQ = 2, 
             rm.phix = TRUE, compress = TRUE, 
             multithread = 5)

Assess Trimmed Read Quality

# Plot the 12 random samples after QC
forward_filteredQual_plot_12 <- 
  plotQualityProfile(filtered_forward_reads[random_samples]) + 
  labs(title = "Trimmed Forward Read Quality")

reverse_filteredQual_plot_12 <- 
  plotQualityProfile(filtered_reverse_reads[random_samples]) + 
  labs(title = "Trimmed Reverse Read Quality")

# Put the two plots together 
forward_filteredQual_plot_12 + reverse_filteredQual_plot_12

Aggregated Trimmed Plots

# Aggregate all QC plots 
# Forward reads
forward_postQC_plot <- 
  plotQualityProfile(filtered_forward_reads, aggregate = TRUE) + 
  labs(title = "Forward Post-QC")

# reverse reads
reverse_postQC_plot <- 
  plotQualityProfile(filtered_reverse_reads, aggregate = TRUE) + 
  labs(title = "Reverse Post-QC")

postQC_aggregate_plot <- 
  # Plot the forward and reverse together 
  forward_postQC_plot + reverse_postQC_plot

# Show the plot
postQC_aggregate_plot

Interpretation:

Here, we see the sequences are improved from before. The very low quality especially in the beginning in the pre-QC plots is mostly gone. If we wanted, we could trim more of the forward sequence.

Stats on read output from filterAndTrim

# Make output into dataframe 
filtered_df <- as.data.frame(filtered_reads)
head(filtered_df)
##                        reads.in reads.out
## SRR11364336_1.fastq.gz    49102     49102
## SRR11364337_1.fastq.gz    59949     59949
## SRR11364338_1.fastq.gz    51272     51272
## SRR11364339_1.fastq.gz    54283     54283
## SRR11364340_1.fastq.gz    41960     41960
## SRR11364341_1.fastq.gz    63891     63891
# calculate some stats 
filtered_df %>%
  reframe(median_reads_in = median(reads.in),
          median_reads_out = median(reads.out),
          median_percent_retained = (median(reads.out)/median(reads.in)))
##   median_reads_in median_reads_out median_percent_retained
## 1           51754            51754                       1

Interpretation:

  • How many reads got through? Is it “enough”? For me the number of reads that got in and out were the same and percent retained was 1. I think it might be more related to maxEE. The dataset was not clear enough with primers. When working on my own dataset I would pay attention to these details.

  • Should you play with the paratmeters in filterAndTrim() more? If so, which parameters? If I were to play with the parameters I would try truncLen and maxEE.

Visualize QC differences in plot

# Plot the pre and post together in one plot
preQC_aggregate_plot / postQC_aggregate_plot

Here, we can see how many total reads were removed from the QC. We also see that the quality is getting better.

Error Modelling

Note every sequencing run needs to be run separately! The error model MUST be run separately on each Illumina dataset. This is because every Illumina run is different, even if the flow cell and DNA/samples are the same. If you’d like to combine the datasets from multiple Illumina sequencing runs, you’ll need to do the exact same filterAndTrim() step AND, very importantly, we’ll need to have the same primer/ASV length/16S location expected by the output.

But wait: what contributes to sequencing error in different sequencing runs and why do we need to model errors separately per run with learnErrors() in dada2? Remember the core principles of how Illumina seuqencing works! Some things that contribute to this are:

-Different timings for when clusters go out of sync (drop in quality at end of reads that’s typical of Illumina sequencing) - The cluster density is impossible to exactly replicate. Therefore, the cluster density (and therefore sequence quality) will always be different between sequencing runs (even if it’s the same person/samples/sequencing facility!). -PhiX spike-in will also vary between runs, even if we try to make it the same! Therefore, the amount of heterogeneity on the flow cell will also be different, impacting the quality.
-Different locations on the flow cell can be impacted differently between runs. Perhaps an air bubble can get in, the cluster density happened to be higher/lower on a different run/flow cell.

Ok, that said. Let’s now infer error rates for all possible transitions within purines and pyrimidines (A<>G or C<>T) and transversions between all purine and pyrimidine combinations. The error model is learned by alternating estimation of the error rates and inference of sample composition until they converge. This specifically:

  1. Starts with the assumption that the error rates are the maximum (takes the most abundant sequence (“center”) and assumes it’s the only sequence not caused by errors).
  2. Compares the other sequences to the most abundant sequence.
  3. Uses at most 108 nucleotides for the error estimation.
  4. Uses parametric error estimation function of loess fit on the observed error rates.

Learn the errors

# Forward reads 
error_forward_reads <- 
  learnErrors(filtered_forward_reads, multithread = 5)
## 104291640 total bases in 384840 reads from 7 samples will be used for learning the error rates.
# Plot Forward  
forward_error_plot <- 
  plotErrors(error_forward_reads, nominalQ = TRUE) + 
  labs(title = "Forward Read Error Model")

# Reverse reads 
error_reverse_reads <- 
  learnErrors(filtered_reverse_reads, multithread = 5)
## 103906800 total bases in 384840 reads from 7 samples will be used for learning the error rates.
# Plot reverse
reverse_error_plot <- 
  plotErrors(error_reverse_reads, nominalQ = TRUE) + 
  labs(title = "Reverse Read Error Model")

# Put the two plots together
forward_error_plot + reverse_error_plot
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.

Interpretation:

The error rates for each possible transition (A→C, A→G, …) are shown in the plot above.

Details of the plot: - Points: The observed error rates for each consensus quality score.
- Black line: Estimated error rates after convergence of the machine-learning algorithm.
- Red line: The error rates expected under the nominal definition of the Q-score.

The estimated error rates (black line) are a “reasonably good” fit to the observed rates (points), and the error rates drop with increased quality.

Infer ASVs

An important note: This process occurs separately on forward and reverse reads! This is quite a different approach from how OTUs are identified in Mothur and also from UCHIME, oligotyping, and other OTU, MED, and ASV approaches.

# Infer ASVs on the forward sequences
dada_forward <- dada(filtered_forward_reads,
                     err = error_forward_reads, 
                     multithread = 5)
## Sample 1 - 49102 reads in 29281 unique sequences.
## Sample 2 - 59949 reads in 32981 unique sequences.
## Sample 3 - 51272 reads in 30455 unique sequences.
## Sample 4 - 54283 reads in 32374 unique sequences.
## Sample 5 - 41960 reads in 24550 unique sequences.
## Sample 6 - 63891 reads in 37060 unique sequences.
## Sample 7 - 64383 reads in 32407 unique sequences.
## Sample 8 - 51800 reads in 29259 unique sequences.
## Sample 9 - 39804 reads in 21799 unique sequences.
## Sample 10 - 56693 reads in 31739 unique sequences.
## Sample 11 - 54348 reads in 28140 unique sequences.
## Sample 12 - 55845 reads in 29480 unique sequences.
## Sample 13 - 44608 reads in 23663 unique sequences.
## Sample 14 - 45230 reads in 24940 unique sequences.
## Sample 15 - 52991 reads in 30382 unique sequences.
## Sample 16 - 57481 reads in 30264 unique sequences.
## Sample 17 - 44410 reads in 22607 unique sequences.
## Sample 18 - 58909 reads in 33269 unique sequences.
## Sample 19 - 58838 reads in 32155 unique sequences.
## Sample 20 - 57179 reads in 31338 unique sequences.
## Sample 21 - 52835 reads in 30687 unique sequences.
## Sample 22 - 70201 reads in 40773 unique sequences.
## Sample 23 - 58712 reads in 30287 unique sequences.
## Sample 24 - 53303 reads in 28142 unique sequences.
## Sample 25 - 42021 reads in 23528 unique sequences.
## Sample 26 - 59789 reads in 33543 unique sequences.
## Sample 27 - 70770 reads in 34846 unique sequences.
## Sample 28 - 44541 reads in 25087 unique sequences.
## Sample 29 - 42920 reads in 21427 unique sequences.
## Sample 30 - 53155 reads in 27523 unique sequences.
## Sample 31 - 48967 reads in 26032 unique sequences.
## Sample 32 - 47997 reads in 26284 unique sequences.
## Sample 33 - 47814 reads in 23348 unique sequences.
## Sample 34 - 44267 reads in 22664 unique sequences.
## Sample 35 - 52528 reads in 29401 unique sequences.
## Sample 36 - 53830 reads in 29498 unique sequences.
## Sample 37 - 46962 reads in 28184 unique sequences.
## Sample 38 - 54456 reads in 29760 unique sequences.
## Sample 39 - 22599 reads in 13429 unique sequences.
## Sample 40 - 42615 reads in 24097 unique sequences.
## Sample 41 - 53562 reads in 28687 unique sequences.
## Sample 42 - 42202 reads in 23896 unique sequences.
## Sample 43 - 43526 reads in 23764 unique sequences.
## Sample 44 - 41993 reads in 21927 unique sequences.
## Sample 45 - 51548 reads in 24233 unique sequences.
## Sample 46 - 38464 reads in 19453 unique sequences.
## Sample 47 - 37371 reads in 22142 unique sequences.
## Sample 48 - 49860 reads in 28507 unique sequences.
## Sample 49 - 56746 reads in 28561 unique sequences.
## Sample 50 - 41871 reads in 22992 unique sequences.
## Sample 51 - 61705 reads in 34123 unique sequences.
## Sample 52 - 55739 reads in 30270 unique sequences.
## Sample 53 - 51708 reads in 29001 unique sequences.
## Sample 54 - 58934 reads in 32429 unique sequences.
## Sample 55 - 53986 reads in 31284 unique sequences.
## Sample 56 - 41780 reads in 24300 unique sequences.
## Sample 57 - 45975 reads in 25141 unique sequences.
## Sample 58 - 52090 reads in 30767 unique sequences.
## Sample 59 - 49075 reads in 29612 unique sequences.
## Sample 60 - 48608 reads in 26491 unique sequences.
typeof(dada_forward)
## [1] "list"
# Grab a sample and look at it 
dada_forward$`SRR11364336_R1_filtered.fastq.gz`
## dada-class: object describing DADA2 denoising results
## 1370 sequence variants were inferred from 29281 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
# Infer ASVs on the reverse sequences 
dada_reverse <- dada(filtered_reverse_reads,
                     err = error_reverse_reads,
                     multithread = 5)
## Sample 1 - 49102 reads in 40702 unique sequences.
## Sample 2 - 59949 reads in 49421 unique sequences.
## Sample 3 - 51272 reads in 42910 unique sequences.
## Sample 4 - 54283 reads in 45459 unique sequences.
## Sample 5 - 41960 reads in 34055 unique sequences.
## Sample 6 - 63891 reads in 50620 unique sequences.
## Sample 7 - 64383 reads in 50825 unique sequences.
## Sample 8 - 51800 reads in 41698 unique sequences.
## Sample 9 - 39804 reads in 32801 unique sequences.
## Sample 10 - 56693 reads in 44508 unique sequences.
## Sample 11 - 54348 reads in 42587 unique sequences.
## Sample 12 - 55845 reads in 43590 unique sequences.
## Sample 13 - 44608 reads in 34616 unique sequences.
## Sample 14 - 45230 reads in 36255 unique sequences.
## Sample 15 - 52991 reads in 42988 unique sequences.
## Sample 16 - 57481 reads in 45099 unique sequences.
## Sample 17 - 44410 reads in 35564 unique sequences.
## Sample 18 - 58909 reads in 46873 unique sequences.
## Sample 19 - 58838 reads in 47063 unique sequences.
## Sample 20 - 57179 reads in 46250 unique sequences.
## Sample 21 - 52835 reads in 43271 unique sequences.
## Sample 22 - 70201 reads in 56382 unique sequences.
## Sample 23 - 58712 reads in 45268 unique sequences.
## Sample 24 - 53303 reads in 42078 unique sequences.
## Sample 25 - 42021 reads in 33502 unique sequences.
## Sample 26 - 59789 reads in 48734 unique sequences.
## Sample 27 - 70770 reads in 57249 unique sequences.
## Sample 28 - 44541 reads in 36726 unique sequences.
## Sample 29 - 42920 reads in 34813 unique sequences.
## Sample 30 - 53155 reads in 41248 unique sequences.
## Sample 31 - 48967 reads in 38940 unique sequences.
## Sample 32 - 47997 reads in 38491 unique sequences.
## Sample 33 - 47814 reads in 38926 unique sequences.
## Sample 34 - 44267 reads in 35936 unique sequences.
## Sample 35 - 52528 reads in 44169 unique sequences.
## Sample 36 - 53830 reads in 43043 unique sequences.
## Sample 37 - 46962 reads in 39087 unique sequences.
## Sample 38 - 54456 reads in 44119 unique sequences.
## Sample 39 - 22599 reads in 19031 unique sequences.
## Sample 40 - 42615 reads in 35155 unique sequences.
## Sample 41 - 53562 reads in 44370 unique sequences.
## Sample 42 - 42202 reads in 35205 unique sequences.
## Sample 43 - 43526 reads in 36741 unique sequences.
## Sample 44 - 41993 reads in 33274 unique sequences.
## Sample 45 - 51548 reads in 41018 unique sequences.
## Sample 46 - 38464 reads in 31276 unique sequences.
## Sample 47 - 37371 reads in 30648 unique sequences.
## Sample 48 - 49860 reads in 42245 unique sequences.
## Sample 49 - 56746 reads in 45554 unique sequences.
## Sample 50 - 41871 reads in 35121 unique sequences.
## Sample 51 - 61705 reads in 49515 unique sequences.
## Sample 52 - 55739 reads in 45179 unique sequences.
## Sample 53 - 51708 reads in 42039 unique sequences.
## Sample 54 - 58934 reads in 48252 unique sequences.
## Sample 55 - 53986 reads in 45348 unique sequences.
## Sample 56 - 41780 reads in 35022 unique sequences.
## Sample 57 - 45975 reads in 38523 unique sequences.
## Sample 58 - 52090 reads in 43271 unique sequences.
## Sample 59 - 49075 reads in 41389 unique sequences.
## Sample 60 - 48608 reads in 40603 unique sequences.
# Inspect 
dada_reverse[1]
## $SRR11364336_R2_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 673 sequence variants were inferred from 40702 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
dada_reverse[30]
## $SRR11364365_R2_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 885 sequence variants were inferred from 41248 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16

Merge Forward & Reverse ASVs

Now, merge the forward and reverse ASVs into contigs.

# merge forward and reverse ASVs
merged_ASVs <- mergePairs(dada_forward, filtered_forward_reads, 
                          dada_reverse, filtered_reverse_reads,
                          verbose = TRUE)
## 23454 paired-reads (in 799 unique pairings) successfully merged out of 37713 (in 6040 pairings) input.
## 31060 paired-reads (in 986 unique pairings) successfully merged out of 48214 (in 6692 pairings) input.
## 24011 paired-reads (in 843 unique pairings) successfully merged out of 39201 (in 6540 pairings) input.
## 25753 paired-reads (in 870 unique pairings) successfully merged out of 41636 (in 6597 pairings) input.
## 20194 paired-reads (in 836 unique pairings) successfully merged out of 31807 (in 5274 pairings) input.
## 31614 paired-reads (in 1138 unique pairings) successfully merged out of 49217 (in 8970 pairings) input.
## 36432 paired-reads (in 1256 unique pairings) successfully merged out of 52935 (in 6594 pairings) input.
## 26071 paired-reads (in 993 unique pairings) successfully merged out of 40522 (in 6368 pairings) input.
## 19416 paired-reads (in 779 unique pairings) successfully merged out of 31231 (in 3918 pairings) input.
## 29141 paired-reads (in 1145 unique pairings) successfully merged out of 44731 (in 6877 pairings) input.
## 30206 paired-reads (in 1209 unique pairings) successfully merged out of 44282 (in 5432 pairings) input.
## 30786 paired-reads (in 1184 unique pairings) successfully merged out of 45326 (in 6094 pairings) input.
## 24139 paired-reads (in 937 unique pairings) successfully merged out of 35989 (in 4653 pairings) input.
## 22790 paired-reads (in 827 unique pairings) successfully merged out of 35189 (in 4972 pairings) input.
## 26911 paired-reads (in 978 unique pairings) successfully merged out of 41658 (in 6404 pairings) input.
## 31516 paired-reads (in 1137 unique pairings) successfully merged out of 46719 (in 6386 pairings) input.
## 23475 paired-reads (in 873 unique pairings) successfully merged out of 35832 (in 4010 pairings) input.
## 30467 paired-reads (in 1135 unique pairings) successfully merged out of 46899 (in 7839 pairings) input.
## 30762 paired-reads (in 1179 unique pairings) successfully merged out of 46963 (in 7381 pairings) input.
## 29612 paired-reads (in 1152 unique pairings) successfully merged out of 45504 (in 6870 pairings) input.
## 25038 paired-reads (in 971 unique pairings) successfully merged out of 40192 (in 6937 pairings) input.
## 34348 paired-reads (in 1165 unique pairings) successfully merged out of 53910 (in 9925 pairings) input.
## 32871 paired-reads (in 1211 unique pairings) successfully merged out of 47732 (in 6522 pairings) input.
## 28997 paired-reads (in 1125 unique pairings) successfully merged out of 42978 (in 6043 pairings) input.
## 21967 paired-reads (in 896 unique pairings) successfully merged out of 33165 (in 5088 pairings) input.
## 30990 paired-reads (in 1096 unique pairings) successfully merged out of 47292 (in 6978 pairings) input.
## 40143 paired-reads (in 1239 unique pairings) successfully merged out of 59108 (in 6432 pairings) input.
## 21161 paired-reads (in 768 unique pairings) successfully merged out of 34122 (in 5174 pairings) input.
## 23808 paired-reads (in 867 unique pairings) successfully merged out of 35226 (in 3656 pairings) input.
## 29686 paired-reads (in 1104 unique pairings) successfully merged out of 43347 (in 5589 pairings) input.
## 26374 paired-reads (in 1005 unique pairings) successfully merged out of 39317 (in 4977 pairings) input.
## 24098 paired-reads (in 889 unique pairings) successfully merged out of 38168 (in 5287 pairings) input.
## 27107 paired-reads (in 953 unique pairings) successfully merged out of 39855 (in 3799 pairings) input.
## 23527 paired-reads (in 904 unique pairings) successfully merged out of 35942 (in 4136 pairings) input.
## 25899 paired-reads (in 960 unique pairings) successfully merged out of 40448 (in 5778 pairings) input.
## 28768 paired-reads (in 1069 unique pairings) successfully merged out of 43050 (in 5897 pairings) input.
## 22426 paired-reads (in 785 unique pairings) successfully merged out of 36144 (in 5870 pairings) input.
## 28351 paired-reads (in 1065 unique pairings) successfully merged out of 43054 (in 5965 pairings) input.
## 10376 paired-reads (in 471 unique pairings) successfully merged out of 16584 (in 2037 pairings) input.
## 21781 paired-reads (in 869 unique pairings) successfully merged out of 33521 (in 4911 pairings) input.
## 28072 paired-reads (in 1037 unique pairings) successfully merged out of 42732 (in 5657 pairings) input.
## 21133 paired-reads (in 834 unique pairings) successfully merged out of 32846 (in 4534 pairings) input.
## 21649 paired-reads (in 830 unique pairings) successfully merged out of 33800 (in 4298 pairings) input.
## 23288 paired-reads (in 892 unique pairings) successfully merged out of 33706 (in 4104 pairings) input.
## 30477 paired-reads (in 1080 unique pairings) successfully merged out of 43035 (in 4336 pairings) input.
## 20144 paired-reads (in 750 unique pairings) successfully merged out of 30867 (in 3408 pairings) input.
## 17871 paired-reads (in 683 unique pairings) successfully merged out of 27911 (in 4574 pairings) input.
## 23835 paired-reads (in 783 unique pairings) successfully merged out of 38419 (in 5406 pairings) input.
## 32394 paired-reads (in 1083 unique pairings) successfully merged out of 47384 (in 5258 pairings) input.
## 20921 paired-reads (in 723 unique pairings) successfully merged out of 33001 (in 3867 pairings) input.
## 33314 paired-reads (in 1063 unique pairings) successfully merged out of 49800 (in 7002 pairings) input.
## 29866 paired-reads (in 983 unique pairings) successfully merged out of 44894 (in 5618 pairings) input.
## 26980 paired-reads (in 899 unique pairings) successfully merged out of 41210 (in 5639 pairings) input.
## 31211 paired-reads (in 1018 unique pairings) successfully merged out of 47320 (in 6306 pairings) input.
## 25963 paired-reads (in 842 unique pairings) successfully merged out of 42198 (in 6346 pairings) input.
## 20684 paired-reads (in 681 unique pairings) successfully merged out of 33043 (in 4354 pairings) input.
## 23064 paired-reads (in 768 unique pairings) successfully merged out of 36422 (in 4386 pairings) input.
## 24552 paired-reads (in 874 unique pairings) successfully merged out of 40092 (in 6780 pairings) input.
## 22285 paired-reads (in 728 unique pairings) successfully merged out of 36961 (in 5947 pairings) input.
## 24629 paired-reads (in 847 unique pairings) successfully merged out of 38737 (in 4705 pairings) input.
# Evaluate the output 
typeof(merged_ASVs)
## [1] "list"
length(merged_ASVs)
## [1] 60
names(merged_ASVs)
##  [1] "SRR11364336_R1_filtered.fastq.gz" "SRR11364337_R1_filtered.fastq.gz"
##  [3] "SRR11364338_R1_filtered.fastq.gz" "SRR11364339_R1_filtered.fastq.gz"
##  [5] "SRR11364340_R1_filtered.fastq.gz" "SRR11364341_R1_filtered.fastq.gz"
##  [7] "SRR11364342_R1_filtered.fastq.gz" "SRR11364343_R1_filtered.fastq.gz"
##  [9] "SRR11364344_R1_filtered.fastq.gz" "SRR11364345_R1_filtered.fastq.gz"
## [11] "SRR11364346_R1_filtered.fastq.gz" "SRR11364347_R1_filtered.fastq.gz"
## [13] "SRR11364348_R1_filtered.fastq.gz" "SRR11364349_R1_filtered.fastq.gz"
## [15] "SRR11364350_R1_filtered.fastq.gz" "SRR11364351_R1_filtered.fastq.gz"
## [17] "SRR11364352_R1_filtered.fastq.gz" "SRR11364353_R1_filtered.fastq.gz"
## [19] "SRR11364354_R1_filtered.fastq.gz" "SRR11364355_R1_filtered.fastq.gz"
## [21] "SRR11364356_R1_filtered.fastq.gz" "SRR11364357_R1_filtered.fastq.gz"
## [23] "SRR11364358_R1_filtered.fastq.gz" "SRR11364359_R1_filtered.fastq.gz"
## [25] "SRR11364360_R1_filtered.fastq.gz" "SRR11364361_R1_filtered.fastq.gz"
## [27] "SRR11364362_R1_filtered.fastq.gz" "SRR11364363_R1_filtered.fastq.gz"
## [29] "SRR11364364_R1_filtered.fastq.gz" "SRR11364365_R1_filtered.fastq.gz"
## [31] "SRR11364366_R1_filtered.fastq.gz" "SRR11364367_R1_filtered.fastq.gz"
## [33] "SRR11364368_R1_filtered.fastq.gz" "SRR11364369_R1_filtered.fastq.gz"
## [35] "SRR11364370_R1_filtered.fastq.gz" "SRR11364371_R1_filtered.fastq.gz"
## [37] "SRR11364372_R1_filtered.fastq.gz" "SRR11364373_R1_filtered.fastq.gz"
## [39] "SRR11364374_R1_filtered.fastq.gz" "SRR11364375_R1_filtered.fastq.gz"
## [41] "SRR11364376_R1_filtered.fastq.gz" "SRR11364377_R1_filtered.fastq.gz"
## [43] "SRR11364378_R1_filtered.fastq.gz" "SRR11364379_R1_filtered.fastq.gz"
## [45] "SRR11364380_R1_filtered.fastq.gz" "SRR11364381_R1_filtered.fastq.gz"
## [47] "SRR11364382_R1_filtered.fastq.gz" "SRR11364383_R1_filtered.fastq.gz"
## [49] "SRR11364384_R1_filtered.fastq.gz" "SRR11364385_R1_filtered.fastq.gz"
## [51] "SRR11364386_R1_filtered.fastq.gz" "SRR11364387_R1_filtered.fastq.gz"
## [53] "SRR11364388_R1_filtered.fastq.gz" "SRR11364389_R1_filtered.fastq.gz"
## [55] "SRR11364390_R1_filtered.fastq.gz" "SRR11364391_R1_filtered.fastq.gz"
## [57] "SRR11364392_R1_filtered.fastq.gz" "SRR11364393_R1_filtered.fastq.gz"
## [59] "SRR11364394_R1_filtered.fastq.gz" "SRR11364395_R1_filtered.fastq.gz"
# Inspect the merger data.frame from the 20210602-MA-ABB1P 

head(merged_ASVs[[3]])
##                                                                                                                                                                                                                                                                                                                                                                                                                                 sequence
## 1 GCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCGACGCCGCGTGAGGGATGACGGCCTTCGGGTTGTAAACCTCTTTCAGTAGGGAAGAAGCGAAAGTGACGGTACCTGCAGAAGAAGCGCCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCCGTGAAAGTCCGGGGCTCAACTCCGGATCTGCGGTGGGTACGGGCAGACTAGAGTGATGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGATGGCGAAGGCAGGTCTCTGGGCATTAACTGACGCTGAGGAGCGAAAGCATGGGGAGCGAACAGGATTAGA
## 2     GCAGCAGTCGGGAATTTTGCCCAATGGACGAAAGTCTGAGGCAGCAACTCCGCGTGAGGGATGAAGGCCTTCGGGTCGTAAACCTCTTTTCCCAGGGAAGATCCAAGACGGTACCTGGGGAATAAGCCACGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGCGTAAAGCGCGCGCAGGCGGCTGGCCAAGTCCGATGTGAAAGCTTCCGGCTTAACTGGAAAACGGCATCGGATACTGGTCGGCTGGAAGGTGGGAGAGGGTAGCGGAATTCCCGGTGTAGTGGTGAAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAAGCGGCTACCTGGCCCACTCTTGACGCTGAGGCGCGAAAGCTAGGGGAGCAAACGGGATTAGA
## 3      GCAGCAGTGGGGAATATTGGACAATGGGGGCAACCCTGATCCAGCAATGCCGCGTGAGTGATGAAGGCCTTCGGGTTGTAAAGCTCTTTTGCCAGGGACGATGATGACGGTACCTGGAGAATAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGCGCGTAGGCGGCTTATCAAGTCAGGCGTGAAATTCCCGGGCTCAACCTGGGGGCTGCGCTTGATACTGATGAGCTTGAATGCGGGAGAGGATAGTGGAATTCCCAGTGTAGAGGTGAAATTCGTAGATATTGGGAAGAACACCGGTGGCGAAGGCGGCTATCTGGCCCGTAATTGACGCTGAGGCGCGAAAGCGTGGGGAGCAAACAGGATTAGA
## 4      GCAGCAGTCGGGAATTTTGCGCAATGGACGAAAGTCTGACGCAGCAACGCCGCGTGAGGGATGACGGCCTTCGGGTTGTAAACCTCTTTTGTCAGGGACGATGATGACGGTACCTGACGAATAAGCCACGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGCGTAAAGAGCGCGCAGGCGGTCGTTCAAGTCGCGTGTGAAAGCCCCCGGCTCAACTGGGGAGGGTCACGCGATACTGATCGACTCGAAGGCAGGAGAGGGTAGTGGAATTCCCGGTGTAGTGGTGAAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGACTACCTGGCCTGTTCTTGACGCTGAGGCGCGAAAGCTAGGGGAGCAAACGGGATTAGA
## 5      GCAGCAGTGGGGAATATTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGGGTGATGAAGGCCCTAGGGTTGTAAAGCCCTTTCAGCGGGGAAGATAATGACGGTACCCGCAGAAGAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTCTAAGTCGGGGGTGAAATCCTGGAGCTCAACTCCAGAACTGCCTTCGATACTGGAAAGCTCGAGTCCGGGAGAGGTGAGTGGAACTGCGAGTGTAGAGGTGAAATTCGTAGATATTCGCAAGAACACCAGTGGCGAAGGCGGCTCACTGGCCCGGTACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGA
## 6      GCAGCAGTGGGGAATATTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGAGTGATGAAGGCCCTAGGGTTGTAAAGCTCTTTTGTGCGGGAAGATAATGACGGTACCGCAAGAATAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATCACTGGGCGTAAAGGGTGCGTAGGCGGGTCTTTAAGTCAGGGGTGAAATCCTGGAGCTCAACTCCAGAACTGCCTTTGATACTGAAGATCTTGAGTTCGGGAGAGGTGAGTGGAACTGCGAGTGTAGAGGTGAAATTCGTAGATATTCGCAAGAACACCAGTGGCGAAGGCGGCTCACTGGCCCGATACTGACGCTGAGGCACGAAAGCGTGGGGAGCAAACAGGATTAGA
##   abundance forward reverse nmatch nmismatch nindel prefer accept
## 1       370       1       1    119         0      0      1   TRUE
## 2       250       2       4    123         0      0      1   TRUE
## 3       228       3       8    124         0      0      1   TRUE
## 4       207       7       6    124         0      0      1   TRUE
## 5       192       4       3    124         0      0      1   TRUE
## 6       173       6       2    124         0      0      1   TRUE

Create Raw ASV Count Table

# Create the ASV Count Table 
raw_ASV_table <- makeSequenceTable(merged_ASVs)

# Write out the file to data/01_DADA2


# Check the type and dimensions of the data
dim(raw_ASV_table)
## [1]   60 7308
class(raw_ASV_table)
## [1] "matrix" "array"
typeof(raw_ASV_table)
## [1] "integer"
# Inspect the distribution of sequence lengths of all ASVs in dataset 
table(nchar(getSequences(raw_ASV_table)))
## 
##  271  342  377  378  388  399  406  409  410  411  412  415  416  417  418  419 
##    1    1    1    2    3    4    1    1    1    8    4    1   71  972  871  330 
##  420  421  422  423  424  425  426  427  428  429  430  431  432  433  434  435 
##  158   22  264   32   30   25   16   16   21   10    8   31   37   28   97   48 
##  436  437  438  439  440  441  442  443  444  445  450  451  452  453  454  463 
##  221  754   51   42  168  679 1324  878   56    4    3    1    5    2    3    2
# Inspect the distribution of sequence lengths of all ASVs in dataset 
# AFTER TRIM
data.frame(Seq_Length = nchar(getSequences(raw_ASV_table))) %>%
  ggplot(aes(x = Seq_Length )) + 
  geom_histogram() + 
  labs(title = "Raw distribution of ASV length")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

###################################################
###################################################
# TRIM THE ASVS
# Let's trim the ASVs to only be the right size, which is 249.
# 249 originates from our expected amplicon of 252 - 3bp in the forward read due to low quality.

# We will allow for a few 
raw_ASV_table_trimmed <- raw_ASV_table[,nchar(colnames(raw_ASV_table)) %in% 400:450]

# Inspect the distribution of sequence lengths of all ASVs in dataset 
table(nchar(getSequences(raw_ASV_table_trimmed)))
## 
##  406  409  410  411  412  415  416  417  418  419  420  421  422  423  424  425 
##    1    1    1    8    4    1   71  972  871  330  158   22  264   32   30   25 
##  426  427  428  429  430  431  432  433  434  435  436  437  438  439  440  441 
##   16   16   21   10    8   31   37   28   97   48  221  754   51   42  168  679 
##  442  443  444  445  450 
## 1324  878   56    4    3
# What proportion is left of the sequences? 
sum(raw_ASV_table_trimmed)/sum(raw_ASV_table)
## [1] 0.9995526
# Inspect the distribution of sequence lengths of all ASVs in dataset 
# AFTER TRIM
data.frame(Seq_Length = nchar(getSequences(raw_ASV_table_trimmed))) %>%
  ggplot(aes(x = Seq_Length )) + 
  geom_histogram() + 
  labs(title = "Trimmed distribution of ASV length")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Note the peak at 249 is ABOVE 3000

# Let's zoom in on the plot 
data.frame(Seq_Length = nchar(getSequences(raw_ASV_table_trimmed))) %>%
  ggplot(aes(x = Seq_Length )) + 
  geom_histogram() + 
  labs(title = "Trimmed distribution of ASV length") + 
  scale_y_continuous(limits = c(0, 500))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Remove Chimeras

Sometimes chimeras arise in our workflow.

Chimeric sequences are artificial sequences formed by the combination of two or more distinct biological sequences. These chimeric sequences can arise during the polymerase chain reaction (PCR) amplification step of the 16S rRNA gene, where fragments from different templates can be erroneously joined together.

Chimera removal is an essential step in the analysis of 16S sequencing data to improve the accuracy of downstream analyses, such as taxonomic assignment and diversity assessment. It helps to avoid the inclusion of misleading or spurious sequences that could lead to incorrect biological interpretations.

# Remove the chimeras in the raw ASV table
noChimeras_ASV_table <- removeBimeraDenovo(raw_ASV_table_trimmed, 
                                           method="consensus", 
                                           multithread=TRUE, verbose=TRUE)
## Identified 776 bimeras out of 7283 input sequences.
# Check the dimensions
dim(noChimeras_ASV_table)
## [1]   60 6507
# What proportion is left of the sequences? 
sum(noChimeras_ASV_table)/sum(raw_ASV_table_trimmed)
## [1] 0.9844314
sum(noChimeras_ASV_table)/sum(raw_ASV_table)
## [1] 0.983991
# Plot it 
data.frame(Seq_Length_NoChim = nchar(getSequences(noChimeras_ASV_table))) %>%
  ggplot(aes(x = Seq_Length_NoChim )) + 
  geom_histogram()+ 
  labs(title = "Trimmed + Chimera Removal distribution of ASV length")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Note the difference in the peak at 249, which is now BELOW 3000

Track the read counts

Here, we will look at the number of reads that were lost in the filtering, denoising, merging, and chimera removal.

# A little function to identify number seqs 
getN <- function(x) sum(getUniques(x))

# Make the table to track the seqs 
track <- cbind(filtered_reads, 
               sapply(dada_forward, getN),
               sapply(dada_reverse, getN),
               sapply(merged_ASVs, getN),
               rowSums(noChimeras_ASV_table))

head(track)
##                        reads.in reads.out                        
## SRR11364336_1.fastq.gz    49102     49102 41856 42877 23454 23135
## SRR11364337_1.fastq.gz    59949     59949 52876 53369 31060 30696
## SRR11364338_1.fastq.gz    51272     51272 43791 44513 24011 23697
## SRR11364339_1.fastq.gz    54283     54283 46173 47553 25753 25578
## SRR11364340_1.fastq.gz    41960     41960 35805 36067 20194 19849
## SRR11364341_1.fastq.gz    63891     63891 54406 56399 31614 31033
# Update column names to be more informative (most are missing at the moment!)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nochim")
rownames(track) <- samples

# Generate a dataframe to track the reads through our DADA2 pipeline
track_counts_df <- 
  track %>%
  # make it a dataframe
  as.data.frame() %>%
  rownames_to_column(var = "names") %>%
  mutate(perc_reads_retained = 100 * nochim / input)

# Visualize it in table format 
DT::datatable(track_counts_df)
# Plot it!
track_counts_df %>%
  pivot_longer(input:nochim, names_to = "read_type", values_to = "num_reads") %>%
  mutate(read_type = fct_relevel(read_type, 
                                 "input", "filtered", "denoisedF", "denoisedR", "merged", "nochim")) %>%
  ggplot(aes(x = read_type, y = num_reads, fill = read_type)) + 
  geom_line(aes(group = names), color = "grey") + 
  geom_point(shape = 21, size = 3, alpha = 0.8) + 
  scale_fill_brewer(palette = "Spectral") + 
  labs(x = "Filtering Step", y = "Number of Sequences") + 
  theme_bw()

Prepare the data for export!

1. ASV Table

Below, we will prepare the following:

  1. Two ASV Count tables:
    1. With ASV seqs: ASV headers include the entire ASV sequence ~250bps.
    2. with ASV names: This includes re-written and shortened headers like ASV_1, ASV_2, etc, which will match the names in our fasta file below.
  2. ASV_fastas: A fasta file that we can use to build a tree for phylogenetic analyses (e.g. phylogenetic alpha diversity metrics or UNIFRAC dissimilarty).

Finalize ASV Count Tables

########### 2. COUNT TABLE ###############
############## Modify the ASV names and then save a fasta file!  ############## 
# Give headers more manageable names
# First pull the ASV sequences
asv_seqs <- colnames(noChimeras_ASV_table)
asv_seqs[1:5]
## [1] "GCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCGACGCCGCGTGAGGGATGACGGCCTTCGGGTTGTAAACCTCTTTCAGTAGGGAAGAAGCGAAAGTGACGGTACCTGCAGAAGAAGCGCCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCCGTGAAAGTCCGGGGCTCAACTCCGGATCTGCGGTGGGTACGGGCAGACTAGAGTGATGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGATGGCGAAGGCAGGTCTCTGGGCATTAACTGACGCTGAGGAGCGAAAGCATGGGGAGCGAACAGGATTAGA"
## [2] "GCAGCAGTGGGGAATATTGGACAATGGGGGCAACCCTGATCCAGCAATGCCGCGTGAGTGATGAAGGCCTTCGGGTTGTAAAGCTCTTTTGCCAGGGACGATGATGACGGTACCTGGAGAATAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGCGCGTAGGCGGCTTATCAAGTCAGGCGTGAAATTCCCGGGCTCAACCTGGGGGCTGCGCTTGATACTGATGAGCTTGAATGCGGGAGAGGATAGTGGAATTCCCAGTGTAGAGGTGAAATTCGTAGATATTGGGAAGAACACCGGTGGCGAAGGCGGCTATCTGGCCCGTAATTGACGCTGAGGCGCGAAAGCGTGGGGAGCAAACAGGATTAGA"     
## [3] "GCAGCAGTGGGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCAATGCCGCGTGAGTGATGAAGGCCTTAGGGTTGTAAAGCTCTTTTACCCGGGATGATAATGACAGTACCGGGAGAATAAGCCCCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCGCGTAGGCGGCTTTGTAAGTTAGAGGTGAAAGCCCGGAGCTCAACTCCGGAACTGCCTTTAAGACTGCATCGCTAGAATTGTGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACACCAGTGGCGAAGGCGACTCACTGGACACATATTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGA"     
## [4] "GCAGCAGTGGGGAATATTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGGGTGATGAAGGCCCTAGGGTTGTAAAGCCCTTTCAGCGGGGAAGATAATGACGGTACCCGCAGAAGAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTCTAAGTCGGGGGTGAAATCCTGGAGCTCAACTCCAGAACTGCCTTCGATACTGGAAAGCTCGAGTCCGGGAGAGGTGAGTGGAACTGCGAGTGTAGAGGTGAAATTCGTAGATATTCGCAAGAACACCAGTGGCGAAGGCGGCTCACTGGCCCGGTACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGA"     
## [5] "GCAGCAGTGGGGAATATTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGAGTGATGAAGGCCCTAGGGTTGTAAAGCTCTTTTGTGCGGGAAGATAATGACGGTACCGCAAGAATAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATCACTGGGCGTAAAGGGTGCGTAGGCGGGTCTTTAAGTCAGGGGTGAAATCCTGGAGCTCAACTCCAGAACTGCCTTTGATACTGAAGATCTTGAGTTCGGGAGAGGTGAGTGGAACTGCGAGTGTAGAGGTGAAATTCGTAGATATTCGCAAGAACACCAGTGGCGAAGGCGGCTCACTGGCCCGATACTGACGCTGAGGCACGAAAGCGTGGGGAGCAAACAGGATTAGA"
# make headers for our ASV seq fasta file, which will be our asv names
asv_headers <- vector(dim(noChimeras_ASV_table)[2], mode = "character")
asv_headers[1:5]
## [1] "" "" "" "" ""
# loop through vector and fill it in with ASV names 
for (i in 1:dim(noChimeras_ASV_table)[2]) {
  asv_headers[i] <- paste(">ASV", i, sep = "_")
}

# intitution check
asv_headers[1:5]
## [1] ">ASV_1" ">ASV_2" ">ASV_3" ">ASV_4" ">ASV_5"
##### Rename ASVs in table then write out our ASV fasta file! 
#View(noChimeras_ASV_table)
asv_tab <- t(noChimeras_ASV_table)
#View(asv_tab)

## Rename our asvs! 
row.names(asv_tab) <- sub(">", "", asv_headers)
#View(asv_tab)

Assign Taxonomy

Here, we will use the silva database version 138!

# Classify the ASVs against a reference set using the RDP Naive Bayesian Classifier described by Wang et al., (2007) in AEM
taxa_train <- 
  assignTaxonomy(noChimeras_ASV_table, 
                 "/workdir/in_class_data/taxonomy/silva_nr99_v138.1_train_set.fa.gz", 
                 multithread=5)

# Add the genus/species information 
taxa_addSpecies <- 
  addSpecies(taxa_train, 
             "/workdir/in_class_data/taxonomy/silva_species_assignment_v138.1.fa.gz")

# Inspect the taxonomy 
taxa_print <- taxa_addSpecies # Removing sequence rownames for display only
rownames(taxa_print) <- NULL
#View(taxa_print)

2. Taxonomy Table

# Inspect the taxonomy table
#View(taxa_addSpecies)

##### Prepare tax table 
# Add the ASV sequences from the rownames to a column 
new_tax_tab <- 
  taxa_addSpecies%>%
  as.data.frame() %>%
  rownames_to_column(var = "ASVseqs") 
head(new_tax_tab)
##                                                                                                                                                                                                                                                                                                                                                                                                                                  ASVseqs
## 1 GCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCGACGCCGCGTGAGGGATGACGGCCTTCGGGTTGTAAACCTCTTTCAGTAGGGAAGAAGCGAAAGTGACGGTACCTGCAGAAGAAGCGCCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCCGTGAAAGTCCGGGGCTCAACTCCGGATCTGCGGTGGGTACGGGCAGACTAGAGTGATGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGATGGCGAAGGCAGGTCTCTGGGCATTAACTGACGCTGAGGAGCGAAAGCATGGGGAGCGAACAGGATTAGA
## 2      GCAGCAGTGGGGAATATTGGACAATGGGGGCAACCCTGATCCAGCAATGCCGCGTGAGTGATGAAGGCCTTCGGGTTGTAAAGCTCTTTTGCCAGGGACGATGATGACGGTACCTGGAGAATAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGCGCGTAGGCGGCTTATCAAGTCAGGCGTGAAATTCCCGGGCTCAACCTGGGGGCTGCGCTTGATACTGATGAGCTTGAATGCGGGAGAGGATAGTGGAATTCCCAGTGTAGAGGTGAAATTCGTAGATATTGGGAAGAACACCGGTGGCGAAGGCGGCTATCTGGCCCGTAATTGACGCTGAGGCGCGAAAGCGTGGGGAGCAAACAGGATTAGA
## 3      GCAGCAGTGGGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCAATGCCGCGTGAGTGATGAAGGCCTTAGGGTTGTAAAGCTCTTTTACCCGGGATGATAATGACAGTACCGGGAGAATAAGCCCCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCGCGTAGGCGGCTTTGTAAGTTAGAGGTGAAAGCCCGGAGCTCAACTCCGGAACTGCCTTTAAGACTGCATCGCTAGAATTGTGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACACCAGTGGCGAAGGCGACTCACTGGACACATATTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGA
## 4      GCAGCAGTGGGGAATATTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGGGTGATGAAGGCCCTAGGGTTGTAAAGCCCTTTCAGCGGGGAAGATAATGACGGTACCCGCAGAAGAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTCTAAGTCGGGGGTGAAATCCTGGAGCTCAACTCCAGAACTGCCTTCGATACTGGAAAGCTCGAGTCCGGGAGAGGTGAGTGGAACTGCGAGTGTAGAGGTGAAATTCGTAGATATTCGCAAGAACACCAGTGGCGAAGGCGGCTCACTGGCCCGGTACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGA
## 5      GCAGCAGTGGGGAATATTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGAGTGATGAAGGCCCTAGGGTTGTAAAGCTCTTTTGTGCGGGAAGATAATGACGGTACCGCAAGAATAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATCACTGGGCGTAAAGGGTGCGTAGGCGGGTCTTTAAGTCAGGGGTGAAATCCTGGAGCTCAACTCCAGAACTGCCTTTGATACTGAAGATCTTGAGTTCGGGAGAGGTGAGTGGAACTGCGAGTGTAGAGGTGAAATTCGTAGATATTCGCAAGAACACCAGTGGCGAAGGCGGCTCACTGGCCCGATACTGACGCTGAGGCACGAAAGCGTGGGGAGCAAACAGGATTAGA
## 6     GCAGCAGTCGGGAATTTTGCCCAATGGACGAAAGTCTGAGGCAGCAACTCCGCGTGAGGGATGAAGGCCTTCGGGTCGTAAACCTCTTTTCCCAGGGAAGATCCAAGACGGTACCTGGGGAATAAGCCACGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGCGTAAAGCGCGCGCAGGCGGCTGGCCAAGTCCGATGTGAAAGCTTCCGGCTTAACTGGAAAACGGCATCGGATACTGGTCGGCTGGAAGGTGGGAGAGGGTAGCGGAATTCCCGGTGTAGTGGTGAAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAAGCGGCTACCTGGCCCACTCTTGACGCTGAGGCGCGAAAGCTAGGGGAGCAAACGGGATTAGA
##    Kingdom           Phylum               Class            Order
## 1 Bacteria Actinobacteriota      Actinobacteria    Micrococcales
## 2 Bacteria   Proteobacteria Alphaproteobacteria        Dongiales
## 3 Bacteria   Proteobacteria Alphaproteobacteria Sphingomonadales
## 4 Bacteria   Proteobacteria Alphaproteobacteria      Rhizobiales
## 5 Bacteria   Proteobacteria Alphaproteobacteria      Rhizobiales
## 6 Bacteria      Chloroflexi         Gitt-GS-136             <NA>
##              Family             Genus Species
## 1    Micrococcaceae Pseudarthrobacter    <NA>
## 2        Dongiaceae            Dongia    <NA>
## 3 Sphingomonadaceae      Sphingomonas  jaspsi
## 4 Xanthobacteraceae      Pseudolabrys    <NA>
## 5 Xanthobacteraceae    Bradyrhizobium    <NA>
## 6              <NA>              <NA>    <NA>
# intution check 
stopifnot(new_tax_tab$ASVseqs == colnames(noChimeras_ASV_table))

# Now let's add the ASV names 
rownames(new_tax_tab) <- rownames(asv_tab)
head(new_tax_tab)
##                                                                                                                                                                                                                                                                                                                                                                                                                                      ASVseqs
## ASV_1 GCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCGACGCCGCGTGAGGGATGACGGCCTTCGGGTTGTAAACCTCTTTCAGTAGGGAAGAAGCGAAAGTGACGGTACCTGCAGAAGAAGCGCCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCCGTGAAAGTCCGGGGCTCAACTCCGGATCTGCGGTGGGTACGGGCAGACTAGAGTGATGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGATGGCGAAGGCAGGTCTCTGGGCATTAACTGACGCTGAGGAGCGAAAGCATGGGGAGCGAACAGGATTAGA
## ASV_2      GCAGCAGTGGGGAATATTGGACAATGGGGGCAACCCTGATCCAGCAATGCCGCGTGAGTGATGAAGGCCTTCGGGTTGTAAAGCTCTTTTGCCAGGGACGATGATGACGGTACCTGGAGAATAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGCGCGTAGGCGGCTTATCAAGTCAGGCGTGAAATTCCCGGGCTCAACCTGGGGGCTGCGCTTGATACTGATGAGCTTGAATGCGGGAGAGGATAGTGGAATTCCCAGTGTAGAGGTGAAATTCGTAGATATTGGGAAGAACACCGGTGGCGAAGGCGGCTATCTGGCCCGTAATTGACGCTGAGGCGCGAAAGCGTGGGGAGCAAACAGGATTAGA
## ASV_3      GCAGCAGTGGGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCAATGCCGCGTGAGTGATGAAGGCCTTAGGGTTGTAAAGCTCTTTTACCCGGGATGATAATGACAGTACCGGGAGAATAAGCCCCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCGCGTAGGCGGCTTTGTAAGTTAGAGGTGAAAGCCCGGAGCTCAACTCCGGAACTGCCTTTAAGACTGCATCGCTAGAATTGTGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACACCAGTGGCGAAGGCGACTCACTGGACACATATTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGA
## ASV_4      GCAGCAGTGGGGAATATTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGGGTGATGAAGGCCCTAGGGTTGTAAAGCCCTTTCAGCGGGGAAGATAATGACGGTACCCGCAGAAGAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTCTAAGTCGGGGGTGAAATCCTGGAGCTCAACTCCAGAACTGCCTTCGATACTGGAAAGCTCGAGTCCGGGAGAGGTGAGTGGAACTGCGAGTGTAGAGGTGAAATTCGTAGATATTCGCAAGAACACCAGTGGCGAAGGCGGCTCACTGGCCCGGTACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGA
## ASV_5      GCAGCAGTGGGGAATATTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGAGTGATGAAGGCCCTAGGGTTGTAAAGCTCTTTTGTGCGGGAAGATAATGACGGTACCGCAAGAATAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATCACTGGGCGTAAAGGGTGCGTAGGCGGGTCTTTAAGTCAGGGGTGAAATCCTGGAGCTCAACTCCAGAACTGCCTTTGATACTGAAGATCTTGAGTTCGGGAGAGGTGAGTGGAACTGCGAGTGTAGAGGTGAAATTCGTAGATATTCGCAAGAACACCAGTGGCGAAGGCGGCTCACTGGCCCGATACTGACGCTGAGGCACGAAAGCGTGGGGAGCAAACAGGATTAGA
## ASV_6     GCAGCAGTCGGGAATTTTGCCCAATGGACGAAAGTCTGAGGCAGCAACTCCGCGTGAGGGATGAAGGCCTTCGGGTCGTAAACCTCTTTTCCCAGGGAAGATCCAAGACGGTACCTGGGGAATAAGCCACGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGCGTAAAGCGCGCGCAGGCGGCTGGCCAAGTCCGATGTGAAAGCTTCCGGCTTAACTGGAAAACGGCATCGGATACTGGTCGGCTGGAAGGTGGGAGAGGGTAGCGGAATTCCCGGTGTAGTGGTGAAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAAGCGGCTACCTGGCCCACTCTTGACGCTGAGGCGCGAAAGCTAGGGGAGCAAACGGGATTAGA
##        Kingdom           Phylum               Class            Order
## ASV_1 Bacteria Actinobacteriota      Actinobacteria    Micrococcales
## ASV_2 Bacteria   Proteobacteria Alphaproteobacteria        Dongiales
## ASV_3 Bacteria   Proteobacteria Alphaproteobacteria Sphingomonadales
## ASV_4 Bacteria   Proteobacteria Alphaproteobacteria      Rhizobiales
## ASV_5 Bacteria   Proteobacteria Alphaproteobacteria      Rhizobiales
## ASV_6 Bacteria      Chloroflexi         Gitt-GS-136             <NA>
##                  Family             Genus Species
## ASV_1    Micrococcaceae Pseudarthrobacter    <NA>
## ASV_2        Dongiaceae            Dongia    <NA>
## ASV_3 Sphingomonadaceae      Sphingomonas  jaspsi
## ASV_4 Xanthobacteraceae      Pseudolabrys    <NA>
## ASV_5 Xanthobacteraceae    Bradyrhizobium    <NA>
## ASV_6              <NA>              <NA>    <NA>
### Final prep of tax table. Add new column with ASV names 
asv_tax <- 
  new_tax_tab %>%
  # add rownames from count table for phyloseq handoff
  mutate(ASV = rownames(asv_tab)) %>%
  # Resort the columns with select
  dplyr::select(Kingdom, Phylum, Class, Order, Family, Genus, Species, ASV, ASVseqs)

head(asv_tax)
##        Kingdom           Phylum               Class            Order
## ASV_1 Bacteria Actinobacteriota      Actinobacteria    Micrococcales
## ASV_2 Bacteria   Proteobacteria Alphaproteobacteria        Dongiales
## ASV_3 Bacteria   Proteobacteria Alphaproteobacteria Sphingomonadales
## ASV_4 Bacteria   Proteobacteria Alphaproteobacteria      Rhizobiales
## ASV_5 Bacteria   Proteobacteria Alphaproteobacteria      Rhizobiales
## ASV_6 Bacteria      Chloroflexi         Gitt-GS-136             <NA>
##                  Family             Genus Species   ASV
## ASV_1    Micrococcaceae Pseudarthrobacter    <NA> ASV_1
## ASV_2        Dongiaceae            Dongia    <NA> ASV_2
## ASV_3 Sphingomonadaceae      Sphingomonas  jaspsi ASV_3
## ASV_4 Xanthobacteraceae      Pseudolabrys    <NA> ASV_4
## ASV_5 Xanthobacteraceae    Bradyrhizobium    <NA> ASV_5
## ASV_6              <NA>              <NA>    <NA> ASV_6
##                                                                                                                                                                                                                                                                                                                                                                                                                                      ASVseqs
## ASV_1 GCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCGACGCCGCGTGAGGGATGACGGCCTTCGGGTTGTAAACCTCTTTCAGTAGGGAAGAAGCGAAAGTGACGGTACCTGCAGAAGAAGCGCCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGCGCAAGCGTTATCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGCCGTGAAAGTCCGGGGCTCAACTCCGGATCTGCGGTGGGTACGGGCAGACTAGAGTGATGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGAAATGCGCAGATATCAGGAGGAACACCGATGGCGAAGGCAGGTCTCTGGGCATTAACTGACGCTGAGGAGCGAAAGCATGGGGAGCGAACAGGATTAGA
## ASV_2      GCAGCAGTGGGGAATATTGGACAATGGGGGCAACCCTGATCCAGCAATGCCGCGTGAGTGATGAAGGCCTTCGGGTTGTAAAGCTCTTTTGCCAGGGACGATGATGACGGTACCTGGAGAATAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGCGCGTAGGCGGCTTATCAAGTCAGGCGTGAAATTCCCGGGCTCAACCTGGGGGCTGCGCTTGATACTGATGAGCTTGAATGCGGGAGAGGATAGTGGAATTCCCAGTGTAGAGGTGAAATTCGTAGATATTGGGAAGAACACCGGTGGCGAAGGCGGCTATCTGGCCCGTAATTGACGCTGAGGCGCGAAAGCGTGGGGAGCAAACAGGATTAGA
## ASV_3      GCAGCAGTGGGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCAATGCCGCGTGAGTGATGAAGGCCTTAGGGTTGTAAAGCTCTTTTACCCGGGATGATAATGACAGTACCGGGAGAATAAGCCCCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCGCGTAGGCGGCTTTGTAAGTTAGAGGTGAAAGCCCGGAGCTCAACTCCGGAACTGCCTTTAAGACTGCATCGCTAGAATTGTGGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAAGAACACCAGTGGCGAAGGCGACTCACTGGACACATATTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGA
## ASV_4      GCAGCAGTGGGGAATATTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGGGTGATGAAGGCCCTAGGGTTGTAAAGCCCTTTCAGCGGGGAAGATAATGACGGTACCCGCAGAAGAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTCTAAGTCGGGGGTGAAATCCTGGAGCTCAACTCCAGAACTGCCTTCGATACTGGAAAGCTCGAGTCCGGGAGAGGTGAGTGGAACTGCGAGTGTAGAGGTGAAATTCGTAGATATTCGCAAGAACACCAGTGGCGAAGGCGGCTCACTGGCCCGGTACTGACGCTGAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGA
## ASV_5      GCAGCAGTGGGGAATATTGGACAATGGGCGCAAGCCTGATCCAGCCATGCCGCGTGAGTGATGAAGGCCCTAGGGTTGTAAAGCTCTTTTGTGCGGGAAGATAATGACGGTACCGCAAGAATAAGCCCCGGCTAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGGGGCTAGCGTTGCTCGGAATCACTGGGCGTAAAGGGTGCGTAGGCGGGTCTTTAAGTCAGGGGTGAAATCCTGGAGCTCAACTCCAGAACTGCCTTTGATACTGAAGATCTTGAGTTCGGGAGAGGTGAGTGGAACTGCGAGTGTAGAGGTGAAATTCGTAGATATTCGCAAGAACACCAGTGGCGAAGGCGGCTCACTGGCCCGATACTGACGCTGAGGCACGAAAGCGTGGGGAGCAAACAGGATTAGA
## ASV_6     GCAGCAGTCGGGAATTTTGCCCAATGGACGAAAGTCTGAGGCAGCAACTCCGCGTGAGGGATGAAGGCCTTCGGGTCGTAAACCTCTTTTCCCAGGGAAGATCCAAGACGGTACCTGGGGAATAAGCCACGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTGTCCGGATTTACTGGGCGTAAAGCGCGCGCAGGCGGCTGGCCAAGTCCGATGTGAAAGCTTCCGGCTTAACTGGAAAACGGCATCGGATACTGGTCGGCTGGAAGGTGGGAGAGGGTAGCGGAATTCCCGGTGTAGTGGTGAAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAAGCGGCTACCTGGCCCACTCTTGACGCTGAGGCGCGAAAGCTAGGGGAGCAAACGGGATTAGA
# Intution check
stopifnot(asv_tax$ASV == rownames(asv_tax), rownames(asv_tax) == rownames(asv_tab))

Write 01_DADA2 files

Now, we will write the files! We will write the following to the data/01_DADA2/ folder. We will save both as files that could be submitted as supplements AND as .RData objects for easy loading into the next steps into R.:

  1. ASV_counts.tsv: ASV count table that has ASV names that are re-written and shortened headers like ASV_1, ASV_2, etc, which will match the names in our fasta file below. This will also be saved as data/01_DADA2/ASV_counts.RData.
  2. ASV_counts_withSeqNames.tsv: This is generated with the data object in this file known as noChimeras_ASV_table. ASV headers include the entire ASV sequence ~250bps. In addition, we will save this as a .RData object as data/01_DADA2/noChimeras_ASV_table.RData as we will use this data in analysis/02_Taxonomic_Assignment.Rmd to assign the taxonomy from the sequence headers.
  3. ASVs.fasta: A fasta file output of the ASV names from ASV_counts.tsv and the sequences from the ASVs in ASV_counts_withSeqNames.tsv. A fasta file that we can use to build a tree for phylogenetic analyses (e.g. phylogenetic alpha diversity metrics or UNIFRAC dissimilarty).
  4. We will also make a copy of ASVs.fasta in data/02_TaxAss_FreshTrain/ to be used for the taxonomy classification in the next step in the workflow.
  5. Write out the taxonomy table
  6. track_read_counts.RData: To track how many reads we lost throughout our workflow that could be used and plotted later. We will add this to the metadata in analysis/02_Taxonomic_Assignment.Rmd.
# FIRST, we will save our output as regular files, which will be useful later on. 
# Save to regular .tsv file 
# Write BOTH the modified and unmodified ASV tables to a file!
# Write count table with ASV numbered names (e.g. ASV_1, ASV_2, etc)
write.table(asv_tab, "data/01_DADA2/ASV_counts.tsv", sep = "\t", quote = FALSE, col.names = NA)
# Write count table with ASV sequence names
write.table(noChimeras_ASV_table, "data/01_DADA2/ASV_counts_withSeqNames.tsv", sep = "\t", quote = FALSE, col.names = NA)
# Write out the fasta file for reference later on for what seq matches what ASV
asv_fasta <- c(rbind(asv_headers, asv_seqs))
# Save to a file!
write(asv_fasta, "data/01_DADA2/ASVs.fasta")


# SECOND, let's save the taxonomy tables 
# Write the table 
write.table(asv_tax, "data/01_DADA2/ASV_taxonomy.tsv", sep = "\t", quote = FALSE, col.names = NA)


# THIRD, let's save to a RData object 
# Each of these files will be used in the analysis/02_Taxonomic_Assignment
# RData objects are for easy loading :) 
save(noChimeras_ASV_table, file = "data/01_DADA2/noChimeras_ASV_table.RData")
save(asv_tab, file = "data/01_DADA2/ASV_counts.RData")
# And save the track_counts_df a R object, which we will merge with metadata information in the next step of the analysis in nalysis/02_Taxonomic_Assignment. 
save(track_counts_df, file = "data/01_DADA2/track_read_counts.RData")

Check Render Time

# Take the time now that we are at the end of the script
end_time <- Sys.time()
end_time 
## [1] "2024-05-02 11:52:53 EDT"
# Echo the elapsed time
elapsed_time <- round((end_time - start_time), 3)
elapsed_time
## Time difference of 2.364 hours

#Conclusion Working on this file helped me understand how data looks before cleaning and after. I realized how important it is to work with different components of filter and trim function.

Session Information

# Ensure reproducibility 
devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.3.2 (2023-10-31)
##  os       Rocky Linux 9.0 (Blue Onyx)
##  system   x86_64, linux-gnu
##  ui       X11
##  language (EN)
##  collate  en_US.UTF-8
##  ctype    en_US.UTF-8
##  tz       America/New_York
##  date     2024-05-02
##  pandoc   3.1.1 @ /usr/lib/rstudio-server/bin/quarto/bin/tools/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package              * version    date (UTC) lib source
##  abind                  1.4-5      2016-07-21 [2] CRAN (R 4.3.2)
##  ade4                   1.7-22     2023-02-06 [1] CRAN (R 4.3.2)
##  ape                    5.7-1      2023-03-13 [2] CRAN (R 4.3.2)
##  Biobase                2.62.0     2023-10-24 [2] Bioconductor
##  BiocGenerics           0.48.1     2023-11-01 [2] Bioconductor
##  BiocManager          * 1.30.22    2023-08-08 [2] CRAN (R 4.3.2)
##  BiocParallel           1.36.0     2023-10-24 [2] Bioconductor
##  biomformat             1.30.0     2023-10-24 [1] Bioconductor
##  Biostrings             2.70.1     2023-10-25 [2] Bioconductor
##  bitops                 1.0-7      2021-04-24 [2] CRAN (R 4.3.2)
##  bslib                  0.5.1      2023-08-11 [2] CRAN (R 4.3.2)
##  cachem                 1.0.8      2023-05-01 [2] CRAN (R 4.3.2)
##  callr                  3.7.3      2022-11-02 [2] CRAN (R 4.3.2)
##  cli                    3.6.1      2023-03-23 [2] CRAN (R 4.3.2)
##  cluster                2.1.4      2022-08-22 [2] CRAN (R 4.3.2)
##  codetools              0.2-19     2023-02-01 [2] CRAN (R 4.3.2)
##  colorspace             2.1-0      2023-01-23 [2] CRAN (R 4.3.2)
##  crayon                 1.5.2      2022-09-29 [2] CRAN (R 4.3.2)
##  crosstalk              1.2.0      2021-11-04 [2] CRAN (R 4.3.2)
##  curl                   5.1.0      2023-10-02 [2] CRAN (R 4.3.2)
##  dada2                * 1.30.0     2023-10-24 [1] Bioconductor
##  data.table             1.14.8     2023-02-17 [2] CRAN (R 4.3.2)
##  DelayedArray           0.28.0     2023-10-24 [2] Bioconductor
##  deldir                 1.0-9      2023-05-17 [2] CRAN (R 4.3.2)
##  devtools             * 2.4.4      2022-07-20 [2] CRAN (R 4.2.1)
##  digest                 0.6.33     2023-07-07 [2] CRAN (R 4.3.2)
##  dplyr                * 1.1.3      2023-09-03 [2] CRAN (R 4.3.2)
##  DT                   * 0.32       2024-02-19 [1] CRAN (R 4.3.2)
##  ellipsis               0.3.2      2021-04-29 [2] CRAN (R 4.3.2)
##  evaluate               0.23       2023-11-01 [2] CRAN (R 4.3.2)
##  fansi                  1.0.5      2023-10-08 [2] CRAN (R 4.3.2)
##  farver                 2.1.1      2022-07-06 [2] CRAN (R 4.3.2)
##  fastmap                1.1.1      2023-02-24 [2] CRAN (R 4.3.2)
##  forcats              * 1.0.0      2023-01-29 [1] CRAN (R 4.3.2)
##  foreach                1.5.2      2022-02-02 [2] CRAN (R 4.3.2)
##  fs                     1.6.3      2023-07-20 [2] CRAN (R 4.3.2)
##  generics               0.1.3      2022-07-05 [2] CRAN (R 4.3.2)
##  GenomeInfoDb           1.38.0     2023-10-24 [2] Bioconductor
##  GenomeInfoDbData       1.2.11     2023-11-07 [2] Bioconductor
##  GenomicAlignments      1.38.0     2023-10-24 [2] Bioconductor
##  GenomicRanges          1.54.1     2023-10-29 [2] Bioconductor
##  ggplot2              * 3.5.0      2024-02-23 [2] CRAN (R 4.3.2)
##  glue                   1.6.2      2022-02-24 [2] CRAN (R 4.3.2)
##  gtable                 0.3.4      2023-08-21 [2] CRAN (R 4.3.2)
##  highr                  0.10       2022-12-22 [2] CRAN (R 4.3.2)
##  hms                    1.1.3      2023-03-21 [1] CRAN (R 4.3.2)
##  htmltools              0.5.7      2023-11-03 [2] CRAN (R 4.3.2)
##  htmlwidgets            1.6.2      2023-03-17 [2] CRAN (R 4.3.2)
##  httpuv                 1.6.12     2023-10-23 [2] CRAN (R 4.3.2)
##  hwriter                1.3.2.1    2022-04-08 [1] CRAN (R 4.3.2)
##  igraph                 1.5.1      2023-08-10 [2] CRAN (R 4.3.2)
##  iNEXT                * 3.0.0      2022-08-29 [1] CRAN (R 4.3.2)
##  interp                 1.1-6      2024-01-26 [1] CRAN (R 4.3.2)
##  IRanges                2.36.0     2023-10-24 [2] Bioconductor
##  iterators              1.0.14     2022-02-05 [2] CRAN (R 4.3.2)
##  jpeg                   0.1-10     2022-11-29 [1] CRAN (R 4.3.2)
##  jquerylib              0.1.4      2021-04-26 [2] CRAN (R 4.3.2)
##  jsonlite               1.8.7      2023-06-29 [2] CRAN (R 4.3.2)
##  knitr                  1.45       2023-10-30 [2] CRAN (R 4.3.2)
##  labeling               0.4.3      2023-08-29 [2] CRAN (R 4.3.2)
##  later                  1.3.1      2023-05-02 [2] CRAN (R 4.3.2)
##  lattice              * 0.21-9     2023-10-01 [2] CRAN (R 4.3.2)
##  latticeExtra           0.6-30     2022-07-04 [1] CRAN (R 4.3.2)
##  lifecycle              1.0.3      2022-10-07 [2] CRAN (R 4.3.2)
##  lubridate            * 1.9.3      2023-09-27 [1] CRAN (R 4.3.2)
##  magrittr               2.0.3      2022-03-30 [2] CRAN (R 4.3.2)
##  MASS                   7.3-60     2023-05-04 [2] CRAN (R 4.3.2)
##  Matrix                 1.6-1.1    2023-09-18 [2] CRAN (R 4.3.2)
##  MatrixGenerics         1.14.0     2023-10-24 [2] Bioconductor
##  matrixStats            1.1.0      2023-11-07 [2] CRAN (R 4.3.2)
##  memoise                2.0.1      2021-11-26 [2] CRAN (R 4.3.2)
##  mgcv                   1.9-0      2023-07-11 [2] CRAN (R 4.3.2)
##  mime                   0.12       2021-09-28 [2] CRAN (R 4.3.2)
##  miniUI                 0.1.1.1    2018-05-18 [2] CRAN (R 4.3.2)
##  multtest               2.58.0     2023-10-24 [1] Bioconductor
##  munsell                0.5.0      2018-06-12 [2] CRAN (R 4.3.2)
##  nlme                   3.1-163    2023-08-09 [2] CRAN (R 4.3.2)
##  pacman                 0.5.1      2019-03-11 [1] CRAN (R 4.3.2)
##  patchwork            * 1.2.0.9000 2024-03-12 [1] Github (thomasp85/patchwork@d943757)
##  permute              * 0.9-7      2022-01-27 [1] CRAN (R 4.3.2)
##  phyloseq             * 1.46.0     2023-10-24 [1] Bioconductor
##  pillar                 1.9.0      2023-03-22 [2] CRAN (R 4.3.2)
##  pkgbuild               1.4.2      2023-06-26 [2] CRAN (R 4.3.2)
##  pkgconfig              2.0.3      2019-09-22 [2] CRAN (R 4.3.2)
##  pkgload                1.3.3      2023-09-22 [2] CRAN (R 4.3.2)
##  plyr                   1.8.9      2023-10-02 [2] CRAN (R 4.3.2)
##  png                    0.1-8      2022-11-29 [2] CRAN (R 4.3.2)
##  prettyunits            1.2.0      2023-09-24 [2] CRAN (R 4.3.2)
##  processx               3.8.2      2023-06-30 [2] CRAN (R 4.3.2)
##  profvis                0.3.8      2023-05-02 [2] CRAN (R 4.3.2)
##  promises               1.2.1      2023-08-10 [2] CRAN (R 4.3.2)
##  ps                     1.7.5      2023-04-18 [2] CRAN (R 4.3.2)
##  purrr                * 1.0.2      2023-08-10 [2] CRAN (R 4.3.2)
##  R6                     2.5.1      2021-08-19 [2] CRAN (R 4.3.2)
##  RColorBrewer           1.1-3      2022-04-03 [2] CRAN (R 4.3.2)
##  Rcpp                 * 1.0.11     2023-07-06 [2] CRAN (R 4.3.2)
##  RcppParallel           5.1.7      2023-02-27 [2] CRAN (R 4.3.2)
##  RCurl                  1.98-1.13  2023-11-02 [2] CRAN (R 4.3.2)
##  readr                * 2.1.5      2024-01-10 [1] CRAN (R 4.3.2)
##  remotes                2.4.2.1    2023-07-18 [2] CRAN (R 4.3.2)
##  reshape2               1.4.4      2020-04-09 [2] CRAN (R 4.3.2)
##  rhdf5                  2.46.1     2023-11-29 [1] Bioconductor 3.18 (R 4.3.2)
##  rhdf5filters           1.14.1     2023-11-06 [1] Bioconductor
##  Rhdf5lib               1.24.2     2024-02-07 [1] Bioconductor 3.18 (R 4.3.2)
##  rlang                  1.1.2      2023-11-04 [2] CRAN (R 4.3.2)
##  rmarkdown              2.25       2023-09-18 [2] CRAN (R 4.3.2)
##  Rsamtools              2.18.0     2023-10-24 [2] Bioconductor
##  rstudioapi             0.15.0     2023-07-07 [2] CRAN (R 4.3.2)
##  S4Arrays               1.2.0      2023-10-24 [2] Bioconductor
##  S4Vectors              0.40.1     2023-10-26 [2] Bioconductor
##  sass                   0.4.7      2023-07-15 [2] CRAN (R 4.3.2)
##  scales                 1.3.0      2023-11-28 [2] CRAN (R 4.3.2)
##  sessioninfo            1.2.2      2021-12-06 [2] CRAN (R 4.3.2)
##  shiny                  1.7.5.1    2023-10-14 [2] CRAN (R 4.3.2)
##  ShortRead              1.60.0     2023-10-24 [1] Bioconductor
##  SparseArray            1.2.1      2023-11-05 [2] Bioconductor
##  stringi                1.7.12     2023-01-11 [2] CRAN (R 4.3.2)
##  stringr              * 1.5.0      2022-12-02 [2] CRAN (R 4.3.2)
##  SummarizedExperiment   1.32.0     2023-10-24 [2] Bioconductor
##  survival               3.5-7      2023-08-14 [2] CRAN (R 4.3.2)
##  tibble               * 3.2.1      2023-03-20 [2] CRAN (R 4.3.2)
##  tidyr                * 1.3.0      2023-01-24 [2] CRAN (R 4.3.2)
##  tidyselect             1.2.1      2024-03-11 [1] CRAN (R 4.3.2)
##  tidyverse            * 2.0.0      2023-02-22 [1] CRAN (R 4.3.2)
##  timechange             0.3.0      2024-01-18 [1] CRAN (R 4.3.2)
##  tzdb                   0.4.0      2023-05-12 [1] CRAN (R 4.3.2)
##  urlchecker             1.0.1      2021-11-30 [2] CRAN (R 4.3.2)
##  usethis              * 2.2.2      2023-07-06 [2] CRAN (R 4.3.2)
##  utf8                   1.2.4      2023-10-22 [2] CRAN (R 4.3.2)
##  vctrs                  0.6.4      2023-10-12 [2] CRAN (R 4.3.2)
##  vegan                * 2.6-4      2022-10-11 [1] CRAN (R 4.3.2)
##  withr                  2.5.2      2023-10-30 [2] CRAN (R 4.3.2)
##  xfun                   0.41       2023-11-01 [2] CRAN (R 4.3.2)
##  xtable                 1.8-4      2019-04-21 [2] CRAN (R 4.3.2)
##  XVector                0.42.0     2023-10-24 [2] Bioconductor
##  yaml                   2.3.7      2023-01-23 [2] CRAN (R 4.3.2)
##  zlibbioc               1.48.0     2023-10-24 [2] Bioconductor
## 
##  [1] /home/gk372/R/x86_64-pc-linux-gnu-library/4.3
##  [2] /programs/R-4.3.2/library
## 
## ──────────────────────────────────────────────────────────────────────────────